Downloaded file RNA-Seq Gencode v3c summarized to genes containing ‘normalized RPKM expression values employing a historical normalization method’
# Load csvs (Downloaded from 'RNA-Seq Gencode v10 summarized to genes' in http://www.brainspan.org/static/download.html)
datExpr = read.csv('./../Data/BrainSpan/expression_matrix.csv', header=FALSE)
datMeta = read.csv('./../Data/BrainSpan/columns_metadata.csv')
geneInfo = read.csv('./../Data/BrainSpan/rows_metadata.csv')
# Remove index column in datExpr
cols = datExpr %>% colnames
datExpr = datExpr %>% dplyr::select(-V1)
colnames(datExpr) = cols[-length(cols)]
# Make sure rows match
if(!all(rownames(datExpr) == geneInfo$row_num)){
stop('Columns in datExpr don\'t match the rows in datMeta!')
}
# Assign row names
rownames(datMeta) = paste0('V', datMeta$column_num)
rownames(datExpr) = geneInfo$ensembl_gene_id
# Annotate probes
getinfo = c('ensembl_gene_id','external_gene_id','chromosome_name','start_position',
'end_position','strand','band','gene_biotype','percentage_gc_content')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
dataset='hsapiens_gene_ensembl',
host='feb2014.archive.ensembl.org') ## Gencode v19
datGenes = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(datExpr), mart=mart)
datGenes = datGenes[match(rownames(datExpr), datGenes$ensembl_gene_id),]
datGenes$length = datGenes$end_position-datGenes$start_position
# Group brain regions by lobes
datMeta$Brain_Region = as.factor(datMeta$structure_acronym)
datMeta$Brain_lobe = 'Other'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('Ocx','V1C')] = 'Occipital'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('M1C-S1C','DFC','OFC','VFC','M1C')] = 'Frontal'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('PCx','IPC', 'S1C')] = 'Parietal'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('AMY','MGE','STC','ITC','HIP','TCx','A1C')] = 'Temporal'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('MFC')] = 'Limbic'
datMeta$Brain_lobe=factor(datMeta$Brain_lobe, levels=c('Frontal', 'Temporal', 'Parietal', 'Occipital','Limbic','Other'))
# Create complementary age variables
datMeta = datMeta %>% mutate('age_stage' = gsub('.* ','',age))
datMeta = datMeta %>% mutate('age_numeric' = case_when(grepl('pcw', age) ~ as.numeric(gsub(' pcw','',age))/52,
grepl('mos', age) ~ as.numeric(gsub(' mos','',age))/12,
TRUE ~ as.numeric(gsub(' yrs','',age)) ))
# SFARI Genes
SFARI_genes = read_csv('./../Data/SFARI/SFARI_genes_with_ensembl_IDs.csv')
## Parsed with column specification:
## cols(
## status = col_double(),
## `gene-symbol` = col_character(),
## `gene-name` = col_character(),
## chromosome = col_character(),
## `genetic-category` = col_character(),
## `gene-score` = col_double(),
## syndromic = col_double(),
## `number-of-reports` = col_double(),
## ID = col_character()
## )
# GO Annotations
GO_annotations = read.csv('./../Data/GO_annotations/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID' = as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
mutate('Neuronal' = 1)
# Combine SFARI and GO information
gene_info = data.frame('ID'=rownames(datExpr)) %>% left_join(SFARI_genes, by='ID') %>%
mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`)) %>%
left_join(GO_neuronal, by='ID') %>%
mutate('gene.score'=ifelse(`gene-score`=='None' & Neuronal==1, 'Neuronal', `gene-score`)) %>%
mutate('gene.score'=ifelse(is.na(gene.score), 'None', gene.score))
rm(cols, getinfo, mart, GO_annotations, geneInfo)
Remove faulty genes
to_keep = !is.na(datGenes$length)
datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]
gene_info = gene_info[to_keep,]
rownames(datGenes) = datGenes$ensembl_gene_id
RNA-Seq for 524 cortical brain-tissue samples across 26 brain regions belonging to 42 subjects
print(paste0('Dataset includes ', nrow(datExpr), ' genes from ', ncol(datExpr), ' samples belonging to ', length(unique(datMeta$donor_id)), ' different subjects.'))
## [1] "Dataset includes 49089 genes from 524 samples belonging to 42 different subjects."
Brain region distribution:
table(datMeta$Brain_Region)
##
## A1C AMY CB CBC CGE DFC DTH HIP IPC
## 31 33 3 29 2 35 5 32 33
## ITC LGE M1C M1C-S1C MD MFC MGE Ocx OFC
## 34 2 26 5 24 32 2 2 31
## PCx S1C STC STR TCx URL V1C VFC
## 2 26 36 28 1 2 33 35
Grouped by lobe:
table(datMeta$Brain_lobe)
##
## Frontal Temporal Parietal Occipital Limbic Other
## 132 169 61 35 32 95
Sex distribution: Fairly balanced
table(datMeta$gender)
##
## F M
## 226 298
Age distribution:
summary(datMeta$age)
## 1 yrs 10 mos 11 yrs 12 pcw 13 pcw 13 yrs 15 yrs 16 pcw 17 pcw 18 yrs
## 16 10 14 45 44 16 5 39 14 13
## 19 pcw 19 yrs 2 yrs 21 pcw 21 yrs 23 yrs 24 pcw 25 pcw 26 pcw 3 yrs
## 11 16 12 16 16 14 16 1 3 25
## 30 yrs 35 pcw 36 yrs 37 pcw 37 yrs 4 mos 4 yrs 40 yrs 8 pcw 8 yrs
## 16 2 16 16 16 33 7 15 16 27
## 9 pcw
## 14
There is a big concentration of genes with very low values and a threshold of 0.5 seems to separate them from the rest of the data.
mean_expr = data.frame('ID'=rownames(datExpr),'Mean'=rowMeans(datExpr, na.rm=T),
'SD'=apply(datExpr,1,function(x) sd(x, na.rm=T)))
mean_expr %>% ggplot(aes(Mean+1)) + geom_density(fill='#0099cc', color='#0099cc', alpha=0.5) +
geom_vline(xintercept=1.5, color='gray', linetype='dashed') + scale_x_log10() + theme_minimal()
rm(mean_expr)
Filtering out genes with a mean expression lower than 0.5
to_keep = rowMeans(datExpr, na.rm=TRUE)>0.5
datExpr = datExpr[to_keep,]
print(paste0('Removed ', sum(!to_keep, na.rm=T), ' genes, ', sum(to_keep, na.rm=T), ' remaining'))
## [1] "Removed 30253 genes, 18836 remaining"
rm(to_keep)
Using node connectivity as a distance measure, normalising it and filtering out genes farther away than 2 standard deviations from the left (lower connectivity than average, not higher)
Using \(s_{ij}=|bw(i,j)|\) to define connectivity between genes.
Filtering three samples, all from different subjects but from the Harvard lab
absadj = datExpr %>% bicor %>% abs
netsummary = fundamentalNetworkConcepts(absadj)
ku = netsummary$Connectivity
z.ku = (ku-mean(ku))/sqrt(var(ku))
plot_data = data.frame('ID'=rownames(datMeta), 'sample'=1:length(z.ku), 'distance'=z.ku, 'brainLobe'=datMeta$Brain_lobe)
ggplotly(plot_data %>% ggplot(aes(sample, distance, color=brainLobe)) + geom_point() +
geom_hline(yintercept=-2, color='gray') + theme_minimal())
print(paste0('Outlier samples: ', paste(as.character(plot_data$ID[plot_data$distance< -2]), collapse=', ')))
## [1] "Outlier samples: 1, 9, 10, 11, 14, 23, 28, 116, 117, 177, 227, 237, 277, 297, 298, 299, 300, 301, 302, 303, 305, 306, 307, 308, 337, 338, 378, 509, 516, 522"
to_keep = z.ku > -2
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]
print(paste0('Removed ', sum(!to_keep), ' samples, ', sum(to_keep), ' remaining'))
## [1] "Removed 30 samples, 494 remaining"
rm(absadj, netsummary, ku, z.ku, plot_data, to_keep)
print(paste0('After filtering, the dataset consists of ', nrow(datExpr), ' genes and ', ncol(datExpr), ' samples'))
## [1] "After filtering, the dataset consists of 18836 genes and 494 samples"
Log transforming the data to help visualisations
datExpr_transformed = log2(datExpr+1)
PCA: The main recognisable pattern separates the samples by age, with the biggest separation between preconception and postconception samples. Brain lobe doesn’t seem to be an important factor
pca = datExpr_transformed %>% t %>% prcomp
plot_data = data.frame('ID'=as.numeric(gsub('V','',colnames(datExpr))), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
left_join(datMeta, by=c('ID'='column_num')) %>%
dplyr::select('PC1','PC2','age_numeric','age_stage','Brain_lobe','gender')
selectable_scatter_plot(plot_data, plot_data[,-c(1,2)])
rm(pca, plot_data)
First Principal Component explains 81% of the total variance
There’s a really strong (negative) correlation between the mean expression of a gene and the 1st principal component
pca = datExpr_transformed %>% prcomp
plot_data = data.frame('PC1' = pca$x[,1], 'PC2' = pca$x[,2], 'PC3' = pca$x[,3],
'MeanExpr'=rowMeans(datExpr_transformed), 'SD'=apply(datExpr_transformed,1,sd))
plot_data %>% ggplot(aes(PC1, PC2, color=MeanExpr)) + geom_point(alpha=0.3) + theme_minimal() +
scale_color_viridis() + ggtitle('PCA') +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)'))
rm(pca, plot_data)
The higher the SFARI score the higher the mean expression, and probably the higher the standard deviation, although that relation is not as clear
plot_data = data.frame('ID'=sub('\\..*', '', rownames(datExpr)),
'MeanExpr'=rowMeans(datExpr), 'SDExpr'=apply(datExpr,1,sd)) %>%
left_join(gene_info, by='ID')
p1 = ggplotly(plot_data %>% ggplot(aes(gene.score, MeanExpr, fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
theme(legend.position='none'))
p2 = ggplotly(plot_data %>% ggplot(aes(gene.score, SDExpr, fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
ggtitle('Mean Expression (left) and SD (right) by SFARI score') +
theme(legend.position='none'))
subplot(p1, p2, nrows=1)
rm(plot_data, p1, p2)
The higher the SFARI score the higher the mean expression, and probably the lower the standard deviation as well, although that relation is not as clear, again.
plot_data = data.frame('ID'=sub('\\..*', '', rownames(datExpr_transformed)),
'MeanExpr'=rowMeans(datExpr_transformed),
'SDExpr'=apply(datExpr_transformed,1,sd)) %>%
left_join(gene_info, by='ID')
p1 = ggplotly(plot_data %>% ggplot(aes(gene.score, MeanExpr, fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
theme(legend.position='none'))
p2 = ggplotly(plot_data %>% ggplot(aes(gene.score, SDExpr, fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
ggtitle('Mean Expression (left) and SD (right) by SFARI score') +
theme(legend.position='none'))
subplot(p1, p2, nrows=1)
rm(plot_data, p1, p2)
save(datExpr, datMeta, datGenes, gene_info, file='./../Data/BrainSpan/cleaned_data.RData')
#load('./../Data/Gupta/preprocessed_data.RData')
sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: Scientific Linux 7.6 (Nitrogen)
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] glue_1.3.1 Rtsne_0.15 vsn_3.50.0
## [4] Biobase_2.42.0 BiocGenerics_0.28.0 WGCNA_1.66
## [7] fastcluster_1.1.25 dynamicTreeCut_1.63-1 forcats_0.3.0
## [10] stringr_1.4.0 dplyr_0.8.0.1 purrr_0.3.1
## [13] readr_1.3.1 tidyr_0.8.3 tibble_2.1.1
## [16] tidyverse_1.2.1 viridis_0.5.1 viridisLite_0.3.0
## [19] gridExtra_2.3 plotlyutils_0.0.0.9000 plotly_4.8.0
## [22] ggplot2_3.1.0 biomaRt_2.38.0
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.4-1 htmlTable_1.11.2 base64enc_0.1-3
## [4] rstudioapi_0.7 affyio_1.52.0 bit64_0.9-7
## [7] AnnotationDbi_1.44.0 mvtnorm_1.0-7 lubridate_1.7.4
## [10] xml2_1.2.0 codetools_0.2-15 splines_3.5.2
## [13] doParallel_1.0.14 impute_1.56.0 robustbase_0.93-0
## [16] knitr_1.22 Formula_1.2-3 jsonlite_1.6
## [19] Cairo_1.5-9 broom_0.5.1 cluster_2.0.7-1
## [22] GO.db_3.7.0 shiny_1.2.0 BiocManager_1.30.4
## [25] rrcov_1.4-3 compiler_3.5.2 httr_1.4.0
## [28] backports_1.1.2 assertthat_0.2.1 Matrix_1.2-15
## [31] lazyeval_0.2.2 limma_3.38.3 cli_1.1.0
## [34] later_0.8.0 acepack_1.4.1 htmltools_0.3.6
## [37] prettyunits_1.0.2 tools_3.5.2 gtable_0.2.0
## [40] affy_1.60.0 Rcpp_1.0.1 cellranger_1.1.0
## [43] preprocessCore_1.44.0 nlme_3.1-137 crosstalk_1.0.0
## [46] iterators_1.0.9 xfun_0.5 rvest_0.3.2
## [49] mime_0.6 XML_3.98-1.11 DEoptimR_1.0-8
## [52] MASS_7.3-51.1 zlibbioc_1.28.0 scales_1.0.0
## [55] promises_1.0.1 hms_0.4.2 RColorBrewer_1.1-2
## [58] curl_3.3 yaml_2.2.0 memoise_1.1.0
## [61] rpart_4.1-13 latticeExtra_0.6-28 stringi_1.4.3
## [64] RSQLite_2.1.1 S4Vectors_0.20.1 pcaPP_1.9-73
## [67] foreach_1.4.4 checkmate_1.8.5 rlang_0.3.2
## [70] pkgconfig_2.0.2 matrixStats_0.54.0 bitops_1.0-6
## [73] evaluate_0.13 lattice_0.20-38 labeling_0.3
## [76] htmlwidgets_1.3 bit_1.1-14 tidyselect_0.2.5
## [79] robust_0.4-18 plyr_1.8.4 magrittr_1.5
## [82] R6_2.4.0 IRanges_2.16.0 generics_0.0.2
## [85] Hmisc_4.1-1 fit.models_0.5-14 DBI_1.0.0
## [88] pillar_1.3.1 haven_1.1.1 foreign_0.8-71
## [91] withr_2.1.2 survival_2.43-3 RCurl_1.95-4.10
## [94] nnet_7.3-12 modelr_0.1.4 crayon_1.3.4
## [97] rmarkdown_1.12 progress_1.2.0 grid_3.5.2
## [100] readxl_1.1.0 data.table_1.12.0 blob_1.1.1
## [103] digest_0.6.18 xtable_1.8-3 httpuv_1.5.0
## [106] stats4_3.5.2 munsell_0.5.0